Homework 5¶

In [2]:
import dalex as dx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
In [3]:
# Load the raw hotel bookings dataset (the Kaggle "Hotel booking demand" CSV,
# expected in the working directory) and preview the first rows.
data = pd.read_csv('hotel_bookings.csv')
data.head()
Out[3]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03

5 rows × 32 columns

In [4]:
# Keep the target plus a small, hand-picked set of booking features.
data = data[['is_canceled', 'lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
            'previous_cancellations', 'is_repeated_guest', 'arrival_date_month', 'deposit_type', 'customer_type']]
categorical_features = ['arrival_date_month', 'deposit_type', 'customer_type']
numeric_features = ['lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
            'previous_cancellations', 'is_repeated_guest']
# Drop rows with missing values (in this dataset 'children' contains a few NaNs).
data = data.dropna()
# Use drop(columns=...) for the features and a 1-D Series for the target:
# a one-column DataFrame target triggers sklearn's column-vector warning in
# fit() and the "Converted to numpy.ndarray" note from dalex.
X = data.drop(columns='is_canceled')
y = data['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
In [5]:
categorical_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()
# One-hot encode the categorical columns, standardize the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_features),
        ('numeric', numeric_transformer, numeric_features),
    ])
# NOTE: the estimator step is a classifier, so it is named accordingly
# (it was previously mislabeled 'regressor').
forest = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=123)),
])

forest.fit(X_train, y_train)
# AUC on the held-out 10% test split, using the predicted probability of class 1.
print(f'ROC score: {roc_auc_score(y_test, forest.predict_proba(X_test)[:, 1])}')
ROC score: 0.8403758492684325
In [6]:
# Wrap the fitted pipeline in a dalex Explainer for model-agnostic analysis.
exp_forest = dx.Explainer(forest, X_train, y_train, label='random_forest')
Preparation of a new explainer is initiated

  -> data              : 107447 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 107447 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : random_forest
  -> predict function  : <function yhat_proba_default at 0x000002271220C550> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.371, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.978, mean = -0.000466, max = 0.989
  -> model_info        : package sklearn

A new explainer has been created!
In [7]:
# Partial-dependence profiles for four numeric features.
forest_mprofile = exp_forest.model_profile(variables = ["lead_time", "booking_changes", "children", 
                                                        "previous_cancellations"], 
                                           type = "partial")
forest_mprofile.plot()
# Observations:
# - as usual, higher lead_time leads to a higher cancellation probability
# - booking changes slightly decrease the probability once their number is > 0
# - previous cancellations make another cancellation more probable, as expected
# - the number of children does not contribute much to the probability
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.10it/s]
In [8]:
# Accumulated-local-effects profiles for the same four features.
forest_mprofile = exp_forest.model_profile(variables = ["lead_time", "booking_changes", "children", 
                                                        "previous_cancellations"], 
                                           type = "ale")
forest_mprofile.plot()
# The ALE profiles look essentially the same as the partial-dependence
# profiles above.
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.91it/s]
Calculating accumulated dependency: 100%|████████████████████████████████████████████████| 4/4 [00:00<00:00,  5.16it/s]
In [9]:
# Partial-dependence profile of lead_time, grouped by number of children.
forest_mprofile = exp_forest.model_profile(variables = ["lead_time"],
                                           groups = "children",
                                           type = "partial")
forest_mprofile.plot()
# The number of children alone may not be decisive for the model,
# but it may split observations into groups that behave differently.

# Looking at the PDP curves for lead_time grouped by number of children:
# when lead_time is high enough (> 300 days), customers with 2 children tend
# to cancel more often than those with 1 or 0; for smaller lead times
# (< 300 days), however, the behavior of all groups looks similar.
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.57it/s]
In [10]:
# Same profile, now grouped by number of babies.
forest_mprofile = exp_forest.model_profile(variables = ["lead_time"],
                                           groups = "babies",
                                           type = "partial")
forest_mprofile.plot()
# For small lead times (< 100 days) the cancellation probability is higher
# for families that have a baby; beyond that, families without babies become
# the more likely ones to cancel.
# A possible explanation: families with babies plan their holidays further
# ahead, but on short notice having a baby can more easily break their plans.
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.29it/s]